library(tidyverse)Solution to dplyr tasks
1 Get started
2 Data transformation with dplyr
Find all penguins that …
- … have a bill length between 40 and 45 mm.
# Keep penguins whose bill length lies in the closed interval [40, 45] mm.
# The measurement columns of `penguins` carry a unit suffix (bill_length_mm).
filter(penguins, between(bill_length_mm, 40, 45))
# same as (between() includes both end points)
# filter(penguins, bill_length_mm >= 40 & bill_length_mm <= 45)
- … for which we know the sex.
filter(penguins, !is.na(sex))# A tibble: 333 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen 36.7 19.3 193 3450
5 Adelie Torgersen 39.3 20.6 190 3650
6 Adelie Torgersen 38.9 17.8 181 3625
7 Adelie Torgersen 39.2 19.6 195 4675
8 Adelie Torgersen 41.1 17.6 182 3200
9 Adelie Torgersen 38.6 21.2 191 3800
10 Adelie Torgersen 34.6 21.1 198 4400
# ℹ 323 more rows
# ℹ 2 more variables: sex <fct>, year <int>
- … which are of the species Adelie or Gentoo
# Keep only the rows whose species is one of the two listed values.
filter(penguins, species %in% c("Adelie", "Gentoo"))
# or
# filter(penguins, (species == "Adelie" | species == "Gentoo"))
- … lived on the island Dream in the year 2007. How many of them were from each of the 3 species?
filter(penguins, island == "Dream" & year == 2007) |>
count(species)# A tibble: 2 × 2
species n
<fct> <int>
1 Adelie 20
2 Chinstrap 26
Count …
- … the number of penguins on each island.
count(penguins, island)# A tibble: 3 × 2
island n
<fct> <int>
1 Biscoe 168
2 Dream 124
3 Torgersen 52
- … the number of penguins of each species on each island.
count(penguins, island, species)# A tibble: 5 × 3
island species n
<fct> <fct> <int>
1 Biscoe Adelie 44
2 Biscoe Gentoo 124
3 Dream Adelie 56
4 Dream Chinstrap 68
5 Torgersen Adelie 52
Select …
- … only the variables species, sex and year
select(penguins, species, sex, year)# A tibble: 344 × 3
species sex year
<fct> <fct> <int>
1 Adelie male 2007
2 Adelie female 2007
3 Adelie female 2007
4 Adelie <NA> 2007
5 Adelie female 2007
6 Adelie male 2007
7 Adelie female 2007
8 Adelie male 2007
9 Adelie <NA> 2007
10 Adelie <NA> 2007
# ℹ 334 more rows
- … only columns that contain measurements in mm
select(penguins, ends_with("mm"))# A tibble: 344 × 3
bill_length_mm bill_depth_mm flipper_length_mm
<dbl> <dbl> <int>
1 39.1 18.7 181
2 39.5 17.4 186
3 40.3 18 195
4 NA NA NA
5 36.7 19.3 193
6 39.3 20.6 190
7 38.9 17.8 181
8 39.2 19.6 195
9 34.1 18.1 193
10 42 20.2 190
# ℹ 334 more rows
# or
#select(penguins, contains("_mm"))Add a column …
- … with the ratio of bill length to bill depth
# Add `ratio`: bill length divided by bill depth (both columns are in mm).
mutate(penguins,
  ratio = bill_length_mm / bill_depth_mm
)
- … with abbreviations for the species (Adelie = A, Gentoo = G, Chinstrap = C).
mutate(penguins,
species_short = case_when(
species == "Adelie" ~ "A",
species == "Gentoo" ~ "G",
species == "Chinstrap" ~ "C"
))# A tibble: 344 × 9
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 3 more variables: sex <fct>, year <int>, species_short <chr>
Calculate …
- … mean flipper length and body mass for the 3 species and male and female penguins separately
# Mean flipper length and mean body mass for every species/sex combination.
# na.rm = TRUE drops missing measurements instead of returning NA for a group.
penguins |>
  summarize(
    mean_flipper = mean(flipper_length_mm, na.rm = TRUE),
    mean_body = mean(body_mass_g, na.rm = TRUE),
    .by = c(species, sex)
  )
- Can you do the same but remove the penguins for which we don’t know the sex first?
# Same summary as before, but rows with unknown sex are removed first so that
# no NA group appears in the result.
penguins |>
  filter(!is.na(sex)) |>
  summarize(
    mean_flipper = mean(flipper_length_mm, na.rm = TRUE),
    mean_body = mean(body_mass_g, na.rm = TRUE),
    .by = c(species, sex)
  )
3 For the fast ones
- Make a boxplot of penguin body mass with sex on the x-axis and facets for the different species. Can you remove the penguins with missing values for sex first?
# Boxplot of body mass per sex, one panel per species.
# Rows with unknown sex are dropped before plotting.
penguins |>
  filter(!is.na(sex)) |>
  ggplot(aes(x = sex, y = body_mass_g)) +
  geom_boxplot() +
  facet_wrap(~species)
- Make a scatterplot with the ratio of bill length to bill depth on the y-axis and flipper length on the x-axis. Can you distinguish the points for male and female penguins and remove penguins with unknown sex before making the plot?
# Scatterplot of the bill length/depth ratio against flipper length,
# coloured by sex. Rows with unknown sex are dropped before plotting.
penguins |>
  mutate(ratio = bill_length_mm / bill_depth_mm) |>
  filter(!is.na(sex)) |>
  ggplot(aes(x = flipper_length_mm, y = ratio, color = sex)) +
  geom_point() +
  # one manual colour per remaining sex level
  scale_color_manual(values = c("cyan4", "darkorange")) +
  labs(
    x = "Flipper length (mm)",
    y = "Ratio bill length / bill depth (-)"
  ) +
  theme_minimal()